library(ISLR)
library(dplyr)
library(readr)
library(broom)
library(ggplot2)
library(tidymodels)
library(stringr)
library(splitstackshape)
library(lubridate)
library(rpart.plot)
library(cluster)
library(forcats)
tidymodels_prefer()
library(probably) #install.packages('probably')
library(vip)
# Load the cleaned IMDB top-1000 dataset and engineer the regression response.
imdb_top_1000 <- read_csv("~/Desktop/Statistical Machine Learning/R Files/Final Project/imdb_top_1000_CLEAN.csv")

# Split the comma-separated Genre column into Genre_1, Genre_2, ... and
# define Gross as the log of net profit (Revenue - Budget).
imdb_clean <- imdb_top_1000 %>%
  cSplit("Genre", sep = ",", direction = "wide") %>%
  mutate(Gross = log(Revenue - Budget))

# Strip the " min" suffix so Runtime is numeric (minutes).
runtime_clean <- imdb_top_1000$Runtime %>%
  str_replace(" min", "") %>%
  as.numeric()
imdb_clean$Runtime <- runtime_clean

# Keep only rows with a finite log-profit and non-missing Budget.
# Fix: the original compared a numeric column to the string "-Inf"
# (filter(Gross != "-Inf")), which only worked via implicit coercion;
# is.finite() removes -Inf, NaN, and NA directly and unambiguously.
imdb_clean <- imdb_clean %>%
  filter(is.finite(Gross)) %>%
  drop_na(Gross, Budget)
head(imdb_clean)
# 10-fold cross-validation splits used throughout the regression analysis.
data_cv10 <- vfold_cv(imdb_clean, v = 10)

# Ordinary least squares model specification.
lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine(engine = "lm")
# Recipe for the full OLS model: all candidate predictors of log profit.
full_lm_rec <- recipe(
  Gross ~ Runtime + IMDB_Rating + Meta_score + No_of_Votes + Genre_1,
  data = imdb_clean
) %>%
  step_nzv(all_predictors()) %>%                 # drop near-zero-variance predictors
  step_normalize(all_numeric_predictors()) %>%   # center and scale numeric predictors
  step_dummy(all_nominal_predictors()) %>%       # indicator-code Genre_1
  step_naomit(Gross, Runtime, IMDB_Rating, Meta_score, No_of_Votes)  # drop NA rows

# Workflow: OLS spec + full recipe.
full_lm_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(full_lm_rec)
# Fit the full OLS model (all predictors) and inspect coefficients.
full_lm_model <- fit(full_lm_wf, data = imdb_clean)
tidy(full_lm_model)

# 10-fold cross-validated performance of the full OLS model.
full_lm_modelcv <- fit_resamples(
  full_lm_wf,
  resamples = data_cv10,
  metrics = metric_set(rmse, rsq, mae)
)
collect_metrics(full_lm_modelcv)
# LASSO specification with a tunable penalty (mixture = 1 => pure LASSO).
lm_lasso_spec_tune <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine(engine = "glmnet") %>%
  set_args(mixture = 1, penalty = tune())

# Recipe for the LASSO model.
data_rec_lasso <- recipe(
  Gross ~ Runtime + IMDB_Rating + Meta_score + No_of_Votes + Genre_1,
  data = imdb_clean
) %>%
  step_nzv(all_predictors()) %>%                 # drop near-zero-variance columns
  step_novel(all_nominal_predictors()) %>%       # guard against unseen factor levels
  step_normalize(all_numeric_predictors()) %>%   # standardization is essential for LASSO
  step_dummy(all_nominal_predictors()) %>%       # indicator variables for Genre_1
  step_naomit(Gross, Runtime, IMDB_Rating, Meta_score, No_of_Votes)  # drop NA rows

# Workflow: LASSO recipe + tunable spec.
lasso_wf_tune <- workflow() %>%
  add_model(lm_lasso_spec_tune) %>%
  add_recipe(data_rec_lasso)
# Tune the penalty (lambda) over a 30-point log-spaced grid via 10-fold CV.
penalty_grid <- grid_regular(penalty(range = c(-3, 1)), levels = 30)
tune_res <- tune_grid(
  lasso_wf_tune,
  resamples = data_cv10,
  metrics = metric_set(rmse, mae),
  grid = penalty_grid
)

# Visualize evaluation metrics across the penalty grid.
autoplot(tune_res) + theme_classic()

# Summarize CV RMSE per penalty and pick the penalty minimizing RMSE.
lasso_mod <- tune_res %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(penalty, rmse = mean)
best_penalty <- select_best(tune_res, metric = "rmse")
lasso_mod
# Finalize the workflow with the selected penalty and refit on all data.
final_wf <- finalize_workflow(lasso_wf_tune, best_penalty)
final_fit <- fit(final_wf, data = imdb_clean)
tidy(final_fit)

# Predictors surviving the LASSO shrinkage (nonzero coefficients).
final_fit %>%
  tidy() %>%
  filter(estimate != 0)

# 10-fold cross-validated performance of the finalized LASSO model.
lasso_fit <- fit_resamples(
  final_wf,
  resamples = data_cv10,
  metrics = metric_set(rmse, rsq, mae)
)
collect_metrics(lasso_fit)
# Training-set predictions and residuals for the LASSO model.
lasso_mod_out <- final_fit %>%
  predict(new_data = imdb_clean) %>%
  bind_cols(imdb_clean) %>%
  mutate(resid = Gross - .pred)

# Residual diagnostics: residuals vs. predictions and each numeric predictor.
ggplot(lasso_mod_out, aes(x = .pred, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
ggplot(lasso_mod_out, aes(x = Runtime, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
ggplot(lasso_mod_out, aes(x = IMDB_Rating, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
ggplot(lasso_mod_out, aes(x = No_of_Votes, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
# Generalized additive model specification (mgcv engine).
gam_spec <- gen_additive_mod() %>%
  set_mode("regression") %>%
  set_engine(engine = "mgcv")

# Smooth terms on Runtime (k = 20 basis functions), IMDB_Rating, and
# No_of_Votes; Meta_score and Genre_1 enter parametrically.
gam_mod1 <- fit(
  gam_spec,
  Gross ~ s(Runtime, k = 20) + s(IMDB_Rating) + Meta_score +
    s(No_of_Votes) + Genre_1,
  data = imdb_clean
)

# Basis-dimension diagnostics: a low p-value (k-index < 1) suggests the
# number of knots k is too small and should be increased.
gam_mod1 %>%
  pluck("fit") %>%
  mgcv::gam.check()
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 9 iterations.
## The RMS GCV score gradient at convergence was 3.345103e-06 .
## The Hessian was positive definite.
## Model rank = 51 / 51
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Runtime) 19.00 14.98 0.97 0.24
## s(IMDB_Rating) 9.00 4.25 0.94 0.09 .
## s(No_of_Votes) 9.00 3.99 1.06 0.91
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model summary: parametric coefficients and smooth-term significance.
summary(pluck(gam_mod1, "fit"))
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Gross ~ s(Runtime, k = 20) + s(IMDB_Rating) + Meta_score + s(No_of_Votes) +
## Genre_1
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.832784 0.429340 39.206 < 2e-16 ***
## Meta_score 0.010241 0.005299 1.932 0.05387 .
## Genre_1Adventure -0.044096 0.247036 -0.179 0.85840
## Genre_1Animation 1.346910 0.301529 4.467 9.81e-06 ***
## Genre_1Biography 0.223135 0.234887 0.950 0.34258
## Genre_1Comedy 0.267256 0.227633 1.174 0.24093
## Genre_1Crime -0.581492 0.232807 -2.498 0.01282 *
## Genre_1Drama -0.334477 0.190120 -1.759 0.07914 .
## Genre_1Family -0.175638 0.962267 -0.183 0.85524
## Genre_1Film-Noir -2.558791 0.976808 -2.620 0.00907 **
## Genre_1Horror 0.512855 0.502375 1.021 0.30781
## Genre_1Mystery -1.175425 0.571692 -2.056 0.04029 *
## Genre_1Thriller -0.520050 1.354114 -0.384 0.70110
## Genre_1Western -0.455014 0.817789 -0.556 0.57819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Runtime) 14.979 17.114 5.202 <2e-16 ***
## s(IMDB_Rating) 4.254 5.211 19.128 <2e-16 ***
## s(No_of_Votes) 3.989 4.958 68.466 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.521 Deviance explained = 55.4%
## GCV = 1.9035 Scale est. = 1.7723 n = 540
# Visualize the estimated smooth (non-linear) functions.
gam_mod1 %>%
  pluck("fit") %>%
  plot(main = "Non-Linear Visualizations")

# Training-set predictions and residuals for the GAM.
gam1_output <- gam_mod1 %>%
  predict(new_data = imdb_clean) %>%
  bind_cols(imdb_clean) %>%
  mutate(resid = Gross - .pred)

# Training-set evaluation metrics. No standard errors are available here
# because this mgcv fit cannot be cross-validated directly with tidymodels.
rmse(gam1_output, truth = Gross, estimate = .pred)
rsq(gam1_output, truth = Gross, estimate = .pred)
# Natural-spline recipe approximating the mgcv GAM's smooth terms.
# Degrees of freedom were chosen to roughly match the knot counts used
# above; not an exact match, but close.
spline_rec <- recipe(
  Gross ~ Runtime + IMDB_Rating + Meta_score + No_of_Votes + Genre_1,
  data = imdb_clean
) %>%
  step_nzv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_naomit(Gross) %>%
  step_ns(Runtime, deg_free = 20) %>%
  step_ns(No_of_Votes, deg_free = 10) %>%
  step_ns(IMDB_Rating, deg_free = 10)

# Inspect the preprocessed design matrix.
spline_rec %>%
  prep(imdb_clean) %>%
  juice()
# Build the spline GAM as a linear model on the natural-spline basis.
lm_spec_gam <- linear_reg() %>%
  set_engine(engine = "lm") %>%
  set_mode("regression")
# Fix: the workflow previously ignored lm_spec_gam (a dead variable) and
# reused lm_spec from earlier. Both specs are identical "lm" regressions,
# so behavior is unchanged, but the dedicated spec is now actually used.
spline_wf <- workflow() %>%
  add_model(lm_spec_gam) %>%
  add_recipe(spline_rec)
GAM2_fit <- fit(spline_wf, data = imdb_clean)
tidy(GAM2_fit)
# 10-fold cross-validated performance of the spline GAM.
cv_output_spline2 <- fit_resamples(
  spline_wf,                            # workflow
  resamples = data_cv10,                # CV folds
  metrics = metric_set(mae, rmse, rsq)
)
collect_metrics(cv_output_spline2)

# Compare RMSE values --> pick the model with the lower RMSE.
rmse(gam1_output, truth = Gross, estimate = .pred)
collect_metrics(cv_output_spline2)
# The GAM built directly with tidymodels (mgcv) outperforms the natural-spline recipe GAM on RMSE, likely because of the degrees of freedom chosen for the splines. Note, however, that we could not cross-validate the tidymodels GAM.
# Residual diagnostics for the final (tidymodels/mgcv) GAM.
ggplot(gam1_output, aes(x = .pred, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  labs(x = "Prediction", y = "Residuals", title = "Residuals vs. Predictions") +
  theme_classic()
ggplot(gam1_output, aes(x = Runtime, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  labs(x = "Runtime", y = "Residuals", title = "Residuals vs. Runtime") +
  theme_classic()
ggplot(gam1_output, aes(x = IMDB_Rating, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  labs(x = "IMDB Rating", y = "Residuals", title = "Residuals vs. IMDB Rating") +
  theme_classic()
ggplot(gam1_output, aes(x = No_of_Votes, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  labs(x = "Number of Votes", y = "Residuals", title = "Residuals vs. Number of Votes") +
  theme_classic()
# The residual plots show no noteworthy bias. The Residuals vs. Runtime trendline curves upward at the extremes because movies with very short or very long runtimes are rare.
# Final comparison of evaluation metrics across the three regression models.
collect_metrics(full_lm_modelcv)                     # OLS (10-fold CV)
collect_metrics(lasso_fit)                           # LASSO (10-fold CV)
rmse(gam1_output, truth = Gross, estimate = .pred)   # GAM (training set; no CV)
# The spline GAM obtained via tidymodels performs best, with the lowest RMSE (about 1.3, versus roughly 1.4 for the other models).
# Classification data: raw (untransformed) profit instead of log-profit.
imdb_class <- imdb_top_1000 %>%
  cSplit("Genre", sep = ",", direction = "wide") %>%
  mutate(Gross = Revenue - Budget)

# Runtime as numeric minutes (strip the " min" suffix).
runtime_clean <- imdb_top_1000$Runtime %>%
  str_replace(" min", "") %>%
  as.numeric()
imdb_class$Runtime <- runtime_clean

imdb_class <- imdb_class %>%
  drop_na(Gross, Budget)

# A movie is labeled a flop (flop = "TRUE") unless its revenue is more than
# double its budget; drop rows missing any modeling variable.
imdb_clean_class <- imdb_class %>%
  mutate(success_ratio = Revenue/Budget) %>%
  mutate(flop = as.factor(ifelse(success_ratio > 2, "FALSE", "TRUE"))) %>%
  drop_na(flop, No_of_Votes, Runtime, IMDB_Rating, Meta_score, Genre_1)
# Random forest specification: ranger engine, 1000 trees, hard class
# predictions, impurity-based variable importance.
rf_spec <- rand_forest() %>%
  set_mode('classification') %>%
  set_engine(engine = 'ranger') %>%
  set_args(
    trees = 1000,           # number of trees in the forest
    min_n = NULL,           # use the engine default for minimum node size
    probability = FALSE,    # hard class predictions, not probabilities
    importance = 'impurity' # Gini-impurity variable importance
  )

# Recipe: classify flop from votes, runtime, ratings, and primary genre.
data_rec <- recipe(
  flop ~ No_of_Votes + Runtime + IMDB_Rating + Meta_score + Genre_1,
  data = imdb_clean_class
) %>%
  step_naomit(flop, No_of_Votes, Runtime, IMDB_Rating, Meta_score, Genre_1)
# One workflow per candidate mtry value (variables tried at each split).
data_wf_mtry3 <- workflow() %>%
  add_recipe(data_rec) %>%
  add_model(rf_spec %>% set_args(mtry = 3))
data_wf_mtry4 <- workflow() %>%
  add_recipe(data_rec) %>%
  add_model(rf_spec %>% set_args(mtry = 4))
data_wf_mtry5 <- workflow() %>%
  add_recipe(data_rec) %>%
  add_model(rf_spec %>% set_args(mtry = 5))

# Fit each forest from the same seed so results are directly comparable.
set.seed(123)
data_fit_mtry3 <- fit(data_wf_mtry3, data = imdb_clean_class)
set.seed(123)
data_fit_mtry4 <- fit(data_wf_mtry4, data = imdb_clean_class)
set.seed(123)
data_fit_mtry5 <- fit(data_wf_mtry5, data = imdb_clean_class)
# Helper: pull the out-of-bag class predictions from a fitted ranger
# workflow and pair them with the observed outcomes plus a model label.
#
# fit_model   : a fitted workflow containing a ranger model.
# model_label : user-provided identifier stored in the `model` column.
# truth       : vector of observed flop outcomes, same order as training data.
rf_OOB_output <- function(fit_model, model_label, truth) {
  oob_preds <- fit_model %>%
    extract_fit_engine() %>%
    pluck('predictions')  # ranger's out-of-bag predictions
  tibble(.pred_class = oob_preds, flop = truth, model = model_label)
}
# Combine OOB predictions from all three forests into one evaluation table.
oob_truth <- imdb_clean_class %>% pull(flop)
data_rf_OOB_output <- bind_rows(
  rf_OOB_output(data_fit_mtry3, 3, oob_truth),
  rf_OOB_output(data_fit_mtry4, 4, oob_truth),
  rf_OOB_output(data_fit_mtry5, 5, oob_truth)
)

# OOB accuracy and sensitivity for each mtry value.
data_rf_OOB_output %>%
  group_by(model) %>%
  accuracy(truth = flop, estimate = .pred_class)
data_rf_OOB_output %>%
  group_by(model) %>%
  sensitivity(truth = flop, estimate = .pred_class)
# mtry = 3 and 4 tied for best sensitivity
# Accuracy vs. mtry Plot
# Fix: `model` already holds the numeric mtry values (3, 4, 5) passed to
# rf_OOB_output, so the former str_replace(model, "mtry", "") was a no-op
# string round-trip; convert directly instead (identical result).
data_rf_OOB_output %>%
  group_by(model) %>%
  accuracy(truth = flop, estimate = .pred_class) %>%
  mutate(mtry = as.numeric(model)) %>%
  ggplot(aes(x = mtry, y = .estimate)) +
  geom_point() +
  geom_line() +
  theme_classic()
# Select mtry = 4 based on accuracy
# OOB evaluation of the chosen forest (mtry = 4).
mtry4_oob <- rf_OOB_output(data_fit_mtry4, 4, imdb_clean_class %>% pull(flop))

# Confusion matrix.
mtry4_oob %>%
  conf_mat(truth = flop, estimate = .pred_class)
## Truth
## Prediction FALSE TRUE
## FALSE 428 120
## TRUE 47 33

# Sensitivity, specificity, and accuracy.
mtry4_oob %>%
  sensitivity(truth = flop, estimate = .pred_class)
mtry4_oob %>%
  specificity(truth = flop, estimate = .pred_class)
mtry4_oob %>%
  accuracy(truth = flop, estimate = .pred_class)
# Impurity-based variable importance for the mtry = 4 forest.
model_output <- extract_fit_engine(data_fit_mtry4)

# Top 10 predictors by impurity importance.
model_output %>%
  vip(num_features = 10) + theme_classic()

# Full importance table: most and least important predictors.
model_output %>%
  vip::vi() %>%
  head()
model_output %>%
  vip::vi() %>%
  tail()
# Permutation-based variable importance: refit the mtry = 4 workflow with
# importance = "permutation" and extract the underlying ranger object.
model_output2 <- data_wf_mtry4 %>%
  update_model(rf_spec %>% set_args(importance = "permutation")) %>%
  fit(data = imdb_clean_class) %>%
  extract_fit_engine()

# Top 10 predictors by permutation importance.
model_output2 %>%
  vip(num_features = 10) + theme_classic()

# Full importance table: head and tail.
model_output2 %>%
  vip::vi() %>%
  head()
model_output2 %>%
  vip::vi() %>%
  tail()
# Distribution of each key predictor by flop status.
ggplot(imdb_clean_class, aes(x = flop, y = No_of_Votes)) +
  geom_violin() +
  theme_classic()
ggplot(imdb_clean_class, aes(x = flop, y = Runtime)) +
  geom_violin() +
  theme_classic()
ggplot(imdb_clean_class, aes(x = flop, y = IMDB_Rating)) +
  geom_violin() +
  theme_classic()
ggplot(imdb_clean_class, aes(x = flop, y = Meta_score)) +
  geom_violin() +
  theme_classic()
set.seed(123)

# Logistic regression model specification.
logistic_spec <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

# Recipe: predict flop from votes, runtime, IMDB rating, and primary genre.
logistic_rec <- recipe(
  flop ~ No_of_Votes + Runtime + IMDB_Rating + Genre_1,
  data = imdb_clean_class
)

# Workflow (recipe + model) and fit.
log_wf <- workflow() %>%
  add_model(logistic_spec) %>%
  add_recipe(logistic_rec)
log_fit <- fit(log_wf, data = imdb_clean_class)
tidy(log_fit)

# Coefficients on the odds-ratio scale.
log_fit %>%
  tidy() %>%
  mutate(OR = exp(estimate))
# 10-fold cross-validation of the logistic model.
data_cv10_class <- vfold_cv(imdb_clean_class, v = 10)
log_modelcv <- fit_resamples(
  log_wf,
  resamples = data_cv10_class,
  metrics = metric_set(accuracy, sens, yardstick::spec)
)
collect_metrics(log_modelcv)

# Soft (probability) predictions on the training data.
final_output <- log_fit %>%
  predict(new_data = imdb_clean_class, type = "prob") %>%
  bind_cols(imdb_clean_class)

# Predicted flop probability by true class.
final_output %>%
  ggplot(aes(x = flop, y = .pred_TRUE)) + geom_boxplot()

# ROC curve using the soft predictions (TRUE is the second factor level).
final_output %>%
  roc_curve(flop, .pred_TRUE, event_level = "second") %>%
  autoplot()
# Sweep classification thresholds on the reference-level probability.
threshold_output <- final_output %>%
  threshold_perf(
    truth = flop,
    estimate = .pred_FALSE,
    thresholds = seq(0, 1, by = 0.01)
  )

# J-index (sensitivity + specificity - 1) vs. threshold.
threshold_output %>%
  filter(.metric == "j_index") %>%
  ggplot(aes(x = .threshold, y = .estimate)) +
  geom_line() +
  labs(y = "J-index", x = "threshold") +
  theme_classic()
threshold_output %>%
  filter(.metric == "j_index") %>%
  arrange(desc(.estimate))

# Distance to the ideal (0, 1) ROC corner vs. threshold.
threshold_output %>%
  filter(.metric == "distance") %>%
  ggplot(aes(x = .threshold, y = .estimate)) +
  geom_line() +
  labs(y = "Distance", x = "threshold") +
  theme_classic()
threshold_output %>%
  filter(.metric == "distance") %>%
  arrange(.estimate)
# Compare the two candidate thresholds (0.78 vs. 0.71) on the same metrics.
log_metrics <- metric_set(accuracy, sens, yardstick::spec)
final_output %>%
  mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = 0.78)) %>%
  log_metrics(truth = flop, estimate = .pred_class, event_level = "second")
final_output %>%
  mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = 0.71)) %>%
  log_metrics(truth = flop, estimate = .pred_class, event_level = "second")

# Confusion matrices at each threshold.
final_output %>%
  mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = 0.78)) %>%
  conf_mat(truth = flop, estimate = .pred_class)
## Truth
## Prediction FALSE TRUE
## FALSE 234 18
## TRUE 241 135
final_output %>%
  mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = 0.71)) %>%
  conf_mat(truth = flop, estimate = .pred_class)
## Truth
## Prediction FALSE TRUE
## FALSE 323 54
## TRUE 152 99
# We chose the threshold of 0.78 for our final model because we prioritize sensitivity over accuracy and specificity.
# Probability prediction for a hypothetical example movie.
example_movie <- data.frame(
  No_of_Votes = 10000,
  Runtime = 112,
  IMDB_Rating = 9.8,
  Genre_1 = "Drama"
)
predict(log_fit, new_data = example_movie, type = "prob")
# We performed the hard prediction manually at threshold = 0.78: since .pred_FALSE = 0.56 is below the 0.78 threshold, we predict this example movie would flop (flop = TRUE).
# Preliminary visualizations before clustering on Budget and Gross.
ggplot(imdb_clean, aes(x = Budget, y = Runtime)) + geom_point() + theme_classic()
imdb_clean %>%
  filter(Budget > 1) %>%
  ggplot(aes(x = Budget, y = Gross)) +
  geom_point() +
  labs(
    x = "Budget in USD",
    y = "Gross Profit in USD (Log Scale)",
    title = "Preliminary Visualizations"
  ) +
  theme_classic()
ggplot(imdb_clean, aes(x = No_of_Votes, y = Runtime)) + geom_point() + theme_classic()
ggplot(imdb_clean, aes(x = Gross, y = No_of_Votes)) + geom_point() + theme_classic()

# Features used for k-means clustering.
imdb_sub <- imdb_clean %>%
  select(Budget, Gross)
set.seed(253)
# Cluster the scaled data with k-means and return the total within-cluster
# sum of squares (used to draw an elbow plot for choosing k).
#
# k    : number of clusters to fit.
# data : numeric data frame / matrix to cluster. Defaults to the global
#        imdb_sub for backward compatibility, but can now be any dataset
#        (generalized from the former hard-wired global reference).
imdb_cluster_ss <- function(k, data = imdb_sub) {
  # Perform clustering on standardized columns
  kclust <- kmeans(scale(data), centers = k)
  # Total within-cluster sum of squares
  kclust$tot.withinss
}
# Elbow plot: total within-cluster SS for k = 1..15.
tibble(k = 1:15, tot_wc_ss = purrr::map_dbl(1:15, imdb_cluster_ss)) %>%
  ggplot(aes(x = k, y = tot_wc_ss)) +
  geom_point() +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  theme_classic()
# K-means with k = 8 on the scaled clustering features.
kclust_k8 <- kmeans(scale(imdb_sub), centers = 8)
kclust_k8$cluster # Display cluster assignments
## (prints the cluster assignment vector: one label in 1..8 per movie)

# Attach cluster assignments to the data as a factor.
imdb_clean <- imdb_clean %>%
  mutate(kclust_8 = factor(kclust_k8$cluster))

# Visualize the cluster assignments on the original scatterplot.
imdb_clean %>%
  ggplot(aes(x = Budget, y = Gross, color = kclust_8)) +
  geom_point() +
  theme_classic()
# Genre frequencies: primary, secondary, and overall genre.
imdb_clean %>%
  count(Genre_1)
imdb_clean %>%
  count(Genre_2)
imdb_clean %>%
  count(New_Genre)

# Genre composition within each k-means cluster.
imdb_clean %>%
  group_by(kclust_8) %>%
  count(Genre_1)
imdb_clean %>%
  group_by(kclust_8) %>%
  count(Genre_2)

# Cluster sizes.
imdb_clean %>%
  count(kclust_8)

# Proportional stacked bar charts: genre mix per cluster.
imdb_clean %>%
  ggplot(aes(x = kclust_8, fill = Genre_1)) +
  geom_bar(position = "fill") +
  labs(x = "Cluster") +
  theme_classic()
imdb_clean %>%
  ggplot(aes(x = kclust_8, fill = Genre_2)) +
  geom_bar(position = "fill") +
  labs(x = "Cluster") +
  theme_classic()
imdb_clean %>%
  ggplot(aes(x = kclust_8, fill = New_Genre)) +
  geom_bar(position = "fill") +
  labs(x = "Cluster") +
  theme_classic()
# Random subsample of 25 movies so the dendrogram stays readable.
set.seed(253)
imdb_hc <- imdb_clean %>%
  slice_sample(n = 25) %>%
  filter(Budget != 0)

# Clustering variables for the subsample and for the full data.
imdb_hc_sub <- imdb_hc %>%
  select(Gross, Budget)
imdb_hc_full <- imdb_clean %>%
  select(Gross, Budget) %>%
  filter(Budget > 1)

# Summary statistics for the subsample variables.
summary(imdb_hc_sub)
## Gross Budget
## Min. :13.82 Min. : 560000
## 1st Qu.:15.91 1st Qu.: 3000000
## Median :17.15 Median : 19250000
## Mean :17.32 Mean : 40957083
## 3rd Qu.:18.89 3rd Qu.: 60000000
## Max. :20.31 Max. :190000000
# Euclidean distance matrices on the scaled variables.
dist_mat_scaled <- dist(scale(imdb_hc_sub))  # 25-movie subset
dist_mat_full <- dist(scale(imdb_hc_full))   # full data

# Average-linkage hierarchical clustering.
imdb_hc_avg <- hclust(dist_mat_scaled, method = "average")   # subset
imdb_full_avg <- hclust(dist_mat_full, method = "average")   # full data

# Dendrograms for the subset, with increasingly detailed genre labels.
plot(imdb_hc_avg)
plot(imdb_hc_avg, labels = imdb_hc$Genre_1, main = "Movie Clusters", xlab = NULL)
plot(imdb_hc_avg, labels = paste(imdb_hc$Genre_1, imdb_hc$Genre_2))
plot(
  imdb_hc_avg,
  main = "Visualizing Movie Clusters",
  labels = paste(imdb_hc$Genre_1, imdb_hc$Genre_2, imdb_hc$Genre_3),
  hang = -1,
  cex = 1
)
plot(imdb_hc_avg, labels = paste(imdb_hc$New_Genre))
# Cut the full-data dendrogram at k = 2, 4, and 8 clusters.
imdb_clean_clust <- imdb_clean %>%
  filter(Budget > 1) %>%
  mutate(
    hclust_num2 = factor(cutree(imdb_full_avg, k = 2)),
    hclust_num4 = factor(cutree(imdb_full_avg, k = 4)),
    hclust_num8 = factor(cutree(imdb_full_avg, k = 8))
  )

# Genre composition by cluster for each choice of k.
ggplot(imdb_clean_clust, aes(x = hclust_num2, fill = Genre_1)) +
  geom_bar(position = "fill") +
  labs(x = "Cluster", y = "Proportion of Cluster", title = "Selecting Number of Clusters (k)") +
  theme_classic() +
  theme(plot.title = element_text(size = 20, face = "bold"))
ggplot(imdb_clean_clust, aes(x = hclust_num4, fill = Genre_1)) +
  geom_bar(position = "fill") +
  labs(x = "Cluster", y = "Proportion of Cluster", title = "Selecting Number of Clusters (k)") +
  theme_classic() +
  theme(plot.title = element_text(size = 20, face = "bold"))
ggplot(imdb_clean_clust, aes(x = hclust_num8, fill = Genre_1)) +
  geom_bar(position = "fill") +
  labs(x = "Cluster", y = "Proportion of Cluster", title = "Selecting Number of Clusters (k)") +
  theme_classic() +
  theme(plot.title = element_text(size = 20, face = "bold"))

# k = 2 again with a legend title, and by overall genre.
ggplot(imdb_clean_clust, aes(x = hclust_num2, fill = Genre_1)) +
  geom_bar(position = "fill") +
  labs(
    x = "Cluster", y = "Proportion of Cluster",
    title = "Selecting Number of Clusters (k)", fill = "Genre 1"
  ) +
  theme_classic() +
  theme(plot.title = element_text(size = 20, face = "bold"))
ggplot(imdb_clean_clust, aes(x = hclust_num2, fill = New_Genre)) +
  geom_bar(position = "fill") +
  labs(x = "Cluster") +
  theme_classic()
# Characterize the two hierarchical clusters on the clustering features.
ggplot(imdb_clean_clust, aes(x = Budget, y = Gross, color = hclust_num2)) +
  geom_point() +
  labs(
    x = "Budget in USD",
    y = "Gross Profit in USD (Log Scale)",
    title = "Visualizing Clusters: Gross Profit vs. Budget",
    color = "Clusters"
  ) +
  theme_classic() +
  theme(plot.title = element_text(size = 20, face = "bold"))
ggplot(imdb_clean_clust, aes(x = hclust_num2, y = Budget)) +
  geom_boxplot() +
  labs(x = "Cluster", y = "Budget in USD", title = "Visualizing Clusters: Budget") +
  theme_classic() +
  theme(plot.title = element_text(size = 20, face = "bold"))
ggplot(imdb_clean_clust, aes(x = hclust_num2, y = Gross)) +
  geom_boxplot() +
  labs(
    x = "Cluster",
    y = "Gross Profit in USD (Log Scale)",
    title = "Visualizing Clusters: Gross Profit"
  ) +
  theme_classic() +
  theme(plot.title = element_text(size = 20, face = "bold"))

# Cluster sizes and summary statistics (Budget shown in units of $100k).
imdb_clean_clust %>%
  count(hclust_num2)
imdb_clean_clust %>%
  group_by(hclust_num2) %>%
  summarize(
    mean(Gross), sd(Gross), min(Gross), max(Gross),
    mean((Budget/1e+05)), sd((Budget/1e+05)),
    min((Budget/1e+05)), max((Budget/1e+05))
  )